# Mount Google Drive into the Colab VM so the notebook can read/write project files.
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
# Utilities
from time import time
from PIL import Image
from zipfile import ZipFile
import os, sys, itertools, re
import warnings, pickle, string
!pip install ftfy
from ftfy import fix_encoding, fix_text, badness
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# Translation APIs
!pip install goslate
from goslate import Goslate
# Numerical calculation
import numpy as np
# Data Handling
import pandas as pd
# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import cufflinks as cf
import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot
# Sequential Modeling
import keras.backend as K
from keras.datasets import imdb
from keras.models import Sequential, Model
from keras.layers.merge import Concatenate
from keras.layers import Input, Dropout, Flatten, Dense, Embedding, LSTM, GRU
from keras.layers import BatchNormalization, TimeDistributed, Conv1D, MaxPooling1D
from keras.constraints import max_norm, unit_norm
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping, ModelCheckpoint
# Traditional Modeling
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# Tools & Evaluation metrics
from sklearn.metrics import confusion_matrix, classification_report, auc
from sklearn.metrics import roc_curve, accuracy_score, precision_recall_curve
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
# NLP toolkits
import spacy
import nltk
from nltk import tokenize
# Configure for any default setting of any library
# Global notebook configuration.
warnings.filterwarnings('ignore')  # silence library deprecation/user warnings in cell output
get_ipython().magic(u'matplotlib inline')  # inline matplotlib rendering (programmatic IPython form)
plt.style.use('ggplot')  # consistent plot styling across all figures
init_notebook_mode(connected=True)  # enable offline plotly rendering in the notebook
cf.go_offline()  # route cufflinks plots through offline plotly
%matplotlib inline
Collecting ftfy
Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
|████████████████████████████████| 53 kB 844 kB/s
Requirement already satisfied: wcwidth>=0.2.5 in /usr/local/lib/python3.7/dist-packages (from ftfy) (0.2.5)
Installing collected packages: ftfy
Successfully installed ftfy-6.1.1
Collecting goslate
Downloading goslate-1.5.2.tar.gz (16 kB)
Collecting futures
Downloading futures-3.0.5.tar.gz (25 kB)
WARNING: Discarding https://files.pythonhosted.org/packages/55/db/97c1ca37edab586a1ae03d6892b6633d8eaa23b23ac40c7e5bbc55423c78/futures-3.0.5.tar.gz#sha256=0542525145d5afc984c88f914a0c85c77527f65946617edb5274f72406f981df (from https://pypi.org/simple/futures/). Command errored out with exit status 1: python setup.py egg_info Check the logs for full command output.
Downloading futures-3.0.4.tar.gz (25 kB)
WARNING: Discarding https://files.pythonhosted.org/packages/8d/73/b5fff618482bc06c9711e7cdc0d5d7eb1904d35898f48f2d7f9696b08bef/futures-3.0.4.tar.gz#sha256=19485d83f7bd2151c0aeaf88fbba3ee50dadfb222ffc3b66a344ef4952b782a3 (from https://pypi.org/simple/futures/). Command errored out with exit status 1: python setup.py egg_info Check the logs for full command output.
Downloading futures-3.0.3.tar.gz (24 kB)
WARNING: Discarding https://files.pythonhosted.org/packages/4c/dc/f9473006d4c9c52d4a4e977173fbcbfb1a8ef3a57e32e885edf994fd4a45/futures-3.0.3.tar.gz#sha256=2fe2342bb4fe8b8e217f0d21b5921cbe5408bf966d9f92025e707e881b198bed (from https://pypi.org/simple/futures/). Command errored out with exit status 1: python setup.py egg_info Check the logs for full command output.
Downloading futures-3.0.2.tar.gz (24 kB)
WARNING: Discarding https://files.pythonhosted.org/packages/f8/e7/fc0fcbeb9193ba2d4de00b065e7fd5aecd0679e93ce95a07322b2b1434f4/futures-3.0.2.tar.gz#sha256=dc3fc91508e49e0fd2f8625f0132d16e49c80f882e7e1d565c56b0d5dfbae257 (from https://pypi.org/simple/futures/). Command errored out with exit status 1: python setup.py egg_info Check the logs for full command output.
Downloading futures-3.0.1.tar.gz (24 kB)
WARNING: Discarding https://files.pythonhosted.org/packages/b2/2c/6b6a57379e47031c6f52e625e0e2b8f6702a8d1f61b6e0daee391e82c187/futures-3.0.1.tar.gz#sha256=f78f2ef458639d72a625cf9c7643cf5442bb222ac11c12bcc445c6ad1cd862e2 (from https://pypi.org/simple/futures/). Command errored out with exit status 1: python setup.py egg_info Check the logs for full command output.
Downloading futures-3.0.0.tar.gz (24 kB)
WARNING: Discarding https://files.pythonhosted.org/packages/ea/c9/35287369718fc05059e7a9d0d73c53745fe981010b4185b3858e7d46eff1/futures-3.0.0.tar.gz#sha256=d9cd7bb09aa01f0e4940af64c31fbd7045098b7b4354420d7838ea39e8b86ee3 (from https://pypi.org/simple/futures/). Command errored out with exit status 1: python setup.py egg_info Check the logs for full command output.
Downloading futures-2.2.0-py2.py3-none-any.whl (16 kB)
Building wheels for collected packages: goslate
Building wheel for goslate (setup.py) ... done
Created wheel for goslate: filename=goslate-1.5.2-py3-none-any.whl size=11436 sha256=760b037f96f37e2e4c5835a7d3d05f30ac2d8319206e748eda605cfe89c24a6b
Stored in directory: /root/.cache/pip/wheels/a8/8a/c4/85425eac5e0746fd5fc898801858331e55ac386f476d65e58d
Successfully built goslate
Installing collected packages: futures, goslate
Successfully installed futures-2.2.0 goslate-1.5.2
# When running inside Colab, (re)mount Drive and make the project folder both
# importable and the current working directory.
if 'google.colab' in sys.modules:
    project_path = "/content/drive/My Drive/Colab Notebooks/"
    from google.colab import drive
    drive.mount('/content/drive/', force_remount=True)
    sys.path.append(project_path)  # allow imports of project-local modules
    %cd $project_path
# NOTE(review): original indentation was lost; the print may have been inside the
# `if` — harmless either way, it only reports the working directory.
print('Current working directory', os.getcwd())
Mounted at /content/drive/ /content/drive/My Drive/Colab Notebooks Current working directory /content/drive/My Drive/Colab Notebooks
# Load the raw ticket dataset (8500 rows x 4 columns) from Excel.
df = pd.read_excel('input_data.xlsx', )
df.head()  # quick visual sanity check of the first rows
| Short description | Description | Caller | Assignment group | |
|---|---|---|---|---|
| 0 | login issue | -verified user details.(employee# & manager na... | spxjnwir pjlcoqds | GRP_0 |
| 1 | outlook | _x000D_\n_x000D_\nreceived from: hmjdrvpb.komu... | hmjdrvpb komuaywn | GRP_0 |
| 2 | cant log in to vpn | _x000D_\n_x000D_\nreceived from: eylqgodm.ybqk... | eylqgodm ybqkwiam | GRP_0 |
| 3 | unable to access hr_tool page | unable to access hr_tool page | xbkucsvz gcpydteq | GRP_0 |
| 4 | skype error | skype error | owlgqjme qhcozdfx | GRP_0 |
# Inspect the last rows — note the non-English entries near the end.
df.tail()
| Short description | Description | Caller | Assignment group | |
|---|---|---|---|---|
| 8495 | emails not coming in from zz mail | _x000D_\n_x000D_\nreceived from: avglmrts.vhqm... | avglmrts vhqmtiua | GRP_29 |
| 8496 | telephony_software issue | telephony_software issue | rbozivdq gmlhrtvp | GRP_0 |
| 8497 | vip2: windows password reset for tifpdchb pedx... | vip2: windows password reset for tifpdchb pedx... | oybwdsgx oxyhwrfz | GRP_0 |
| 8498 | machine não está funcionando | i am unable to access the machine utilities to... | ufawcgob aowhxjky | GRP_62 |
| 8499 | an mehreren pc`s lassen sich verschiedene prgr... | an mehreren pc`s lassen sich verschiedene prgr... | kqvbrspl jyzoklfx | GRP_49 |
# Report dataset dimensions; \033[1m / \033[0m are ANSI bold on/off escapes.
print('No of rows:\033[1m', df.shape[0], '\033[0m')
print('No of cols:\033[1m', df.shape[1], '\033[0m')
No of rows: 8500 No of cols: 4
# Column dtypes and non-null counts — reveals 8 + 1 missing values.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8500 entries, 0 to 8499 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Short description 8492 non-null object 1 Description 8499 non-null object 2 Caller 8500 non-null object 3 Assignment group 8500 non-null object dtypes: object(4) memory usage: 265.8+ KB
# Summary stats for object columns: counts, unique values, top value and its frequency.
df.describe()
| Short description | Description | Caller | Assignment group | |
|---|---|---|---|---|
| count | 8492 | 8499 | 8500 | 8500 |
| unique | 7481 | 7817 | 2950 | 74 |
| top | password reset | the | bpctwhsn kzqsbmtp | GRP_0 |
| freq | 38 | 56 | 810 | 3976 |
# describe() showed the literal string 'the' as the most frequent Description —
# inspect those rows (they turn out to be password-reset tickets).
df[df.Description == 'the'].head()
| Short description | Description | Caller | Assignment group | |
|---|---|---|---|---|
| 1049 | reset passwords for soldfnbq uhnbsvqd using pa... | the | soldfnbq uhnbsvqd | GRP_17 |
| 1054 | reset passwords for fygrwuna gomcekzi using pa... | the | fygrwuna gomcekzi | GRP_17 |
| 1144 | reset passwords for wvdxnkhf jirecvta using pa... | the | wvdxnkhf jirecvta | GRP_17 |
| 1184 | reset passwords for pxvjczdt kizsjfpq using pa... | the | pxvjczdt kizsjfpq | GRP_17 |
| 1292 | reset passwords for cubdsrml znewqgop using pa... | the | cubdsrml znewqgop | GRP_17 |
# Per-column count of missing values.
df.isnull().sum()
Short description 8 Description 1 Caller 0 Assignment group 0 dtype: int64
# Show every row that has at least one missing value in any column.
df[pd.isnull(df).any(axis=1)]
| Short description | Description | Caller | Assignment group | |
|---|---|---|---|---|
| 2604 | NaN | _x000D_\n_x000D_\nreceived from: ohdrnswl.rezu... | ohdrnswl rezuibdt | GRP_34 |
| 3383 | NaN | _x000D_\n-connected to the user system using t... | qftpazns fxpnytmk | GRP_0 |
| 3906 | NaN | -user unable tologin to vpn._x000D_\n-connect... | awpcmsey ctdiuqwe | GRP_0 |
| 3910 | NaN | -user unable tologin to vpn._x000D_\n-connect... | rhwsmefo tvphyura | GRP_0 |
| 3915 | NaN | -user unable tologin to vpn._x000D_\n-connect... | hxripljo efzounig | GRP_0 |
| 3921 | NaN | -user unable tologin to vpn._x000D_\n-connect... | cziadygo veiosxby | GRP_0 |
| 3924 | NaN | name:wvqgbdhm fwchqjor\nlanguage:\nbrowser:mic... | wvqgbdhm fwchqjor | GRP_0 |
| 4341 | NaN | _x000D_\n_x000D_\nreceived from: eqmuniov.ehxk... | eqmuniov ehxkcbgj | GRP_0 |
| 4395 | i am locked out of skype | NaN | viyglzfo ajtfzpkb | GRP_0 |
# Re-check missing-value counts (duplicate of the earlier cell; nothing changed yet).
df.isnull().sum()
Short description 8 Description 1 Caller 0 Assignment group 0 dtype: int64
# Work on a string-typed copy so NaNs compare equal when detecting duplicate rows.
df_copy=df.astype(str)
duplicateRows_df=df_copy[df_copy.duplicated()]  # rows identical to an earlier row
print(duplicateRows_df)
print(duplicateRows_df.sum())  # column-wise concatenation of duplicates (crude summary)
Short description \
51 call for ecwtrjnq jpecxuty
229 call for ecwtrjnq jpecxuty
493 ticket update on inplant_872730
512 blank call //gso
667 job bkbackup_tool_powder_prod_full failed in j...
... ...
7836 probleme mit erpgui \tmqfjard qzhgdoua
8051 issue on pricing in distributor_tool
8093 reset passwords for prgthyuulla ramdntythanjes...
8347 blank call // loud noise
8405 unable to launch outlook
Description Caller \
51 call for ecwtrjnq jpecxuty olckhmvx pcqobjnd
229 call for ecwtrjnq jpecxuty olckhmvx pcqobjnd
493 ticket update on inplant_872730 fumkcsji sarmtlhy
512 blank call //gso rbozivdq gmlhrtvp
667 received from: monitoring_tool@company.com_x00... bpctwhsn kzqsbmtp
... ... ...
7836 probleme mit erpgui \tmqfjard qzhgdoua tmqfjard qzhgdoua
8051 we have agreed price with many of the distribu... hbmwlprq ilfvyodx
8093 the boirqctx bkijgqry
8347 blank call // loud noise rbozivdq gmlhrtvp
8405 unable to launch outlook wjtzrmqc ikqpbflg
Assignment group
51 GRP_0
229 GRP_0
493 GRP_0
512 GRP_0
667 GRP_8
... ...
7836 GRP_24
8051 GRP_21
8093 GRP_17
8347 GRP_0
8405 GRP_0
[83 rows x 4 columns]
Short description call for ecwtrjnq jpecxutycall for ecwtrjnq jp...
Description call for ecwtrjnq jpecxutycall for ecwtrjnq jp...
Caller olckhmvx pcqobjndolckhmvx pcqobjndfumkcsji sar...
Assignment group GRP_0GRP_0GRP_0GRP_0GRP_8GRP_0GRP_8GRP_0GRP_0G...
dtype: object
def rm_duplicate(text):
    """Drop fully duplicated ticket rows from *text* (a DataFrame), in place.

    A row counts as a duplicate when all four ticket columns match an
    earlier row. The same (mutated) DataFrame is returned for chaining.
    """
    key_columns = ['Short description', 'Description', 'Caller', 'Assignment group']
    text.drop_duplicates(subset=key_columns, inplace=True)
    return text
# Remove the 83 exact-duplicate rows in place; the returned frame is displayed.
rm_duplicate(df_copy)
| Short description | Description | Caller | Assignment group | |
|---|---|---|---|---|
| 0 | login issue | -verified user details.(employee# & manager na... | spxjnwir pjlcoqds | GRP_0 |
| 1 | outlook | _x000D_\n_x000D_\nreceived from: hmjdrvpb.komu... | hmjdrvpb komuaywn | GRP_0 |
| 2 | cant log in to vpn | _x000D_\n_x000D_\nreceived from: eylqgodm.ybqk... | eylqgodm ybqkwiam | GRP_0 |
| 3 | unable to access hr_tool page | unable to access hr_tool page | xbkucsvz gcpydteq | GRP_0 |
| 4 | skype error | skype error | owlgqjme qhcozdfx | GRP_0 |
| ... | ... | ... | ... | ... |
| 8495 | emails not coming in from zz mail | _x000D_\n_x000D_\nreceived from: avglmrts.vhqm... | avglmrts vhqmtiua | GRP_29 |
| 8496 | telephony_software issue | telephony_software issue | rbozivdq gmlhrtvp | GRP_0 |
| 8497 | vip2: windows password reset for tifpdchb pedx... | vip2: windows password reset for tifpdchb pedx... | oybwdsgx oxyhwrfz | GRP_0 |
| 8498 | machine não está funcionando | i am unable to access the machine utilities to... | ufawcgob aowhxjky | GRP_62 |
| 8499 | an mehreren pc`s lassen sich verschiedene prgr... | an mehreren pc`s lassen sich verschiedene prgr... | kqvbrspl jyzoklfx | GRP_49 |
8417 rows × 4 columns
We can address NULL/missing values in the dataset in a variety of ways — for example, dropping the affected rows, filling the missing field with a placeholder, or reusing the text from the other description column.
# Bar chart of ticket counts per assignment group — shows heavy class imbalance (GRP_0 dominates).
df['Assignment group'].value_counts().plot(kind='bar', figsize=(20,10), title='Class Label Distribution')
<matplotlib.axes._subplots.AxesSubplot at 0x7f8d0d9f5990>
Mojibake
Mojibake is the garbled text that is the result of text being decoded using an unintended character encoding. The result is a systematic replacement of symbols with completely unrelated ones, often from a different writing system.
This display may include the generic replacement character ("�") in places where the binary representation is considered invalid. A replacement can also involve multiple consecutive symbols, as viewed in one encoding, when the same binary code constitutes one symbol in the other encoding. This is either because of differing constant-length encodings (as in Asian 16-bit encodings vs. European 8-bit encodings), or the use of variable-length encodings (notably UTF-8 and UTF-16). A few examples of such mojibake characters are ¶, ç, å, €, æ, œ, º, ‡, ¼ and ¥.
As we're dealing with Natural Language and the source of the data is unknown to us, let's run the encoding check to figure out if the dataset is Mojibake impacted.
The library ftfy (Fixes Text For You) has a greater ability to detect, fix and deal with such Mojibakes. It fixes Unicode that’s broken in various ways. The goal of ftfy is to take in bad Unicode and output good Unicode.
# Character classes used to assemble the mojibake-detection regex below.
# Each value is a run of characters and/or regex ranges (e.g. "\xc0-\xd1")
# meant to be interpolated inside a regex character class [...]; adjacent
# string literals concatenate implicitly.
# NOTE(review): this table appears to be vendored from ftfy's `badness`
# module — if the installed ftfy exposes it, importing would beat keeping a copy.
MOJIBAKE_CATEGORIES = {
    # Characters that appear in real text and mojibake alike.
    "common": (
        "\N{NO-BREAK SPACE}"
        "\N{SOFT HYPHEN}"
        "\N{MIDDLE DOT}"
        "\N{ACUTE ACCENT}"
        "\N{EN DASH}"
        "\N{EM DASH}"
        "\N{HORIZONTAL BAR}"
        "\N{HORIZONTAL ELLIPSIS}"
        "\N{RIGHT SINGLE QUOTATION MARK}"
    ),
    # The C1 control range — almost exclusively seen in mojibake.
    "c1": "\x80-\x9f",
    # Characters that rarely occur legitimately next to the classes below.
    "bad": (
        "\N{BROKEN BAR}"
        "\N{CURRENCY SIGN}"
        "\N{DIAERESIS}"
        "\N{NOT SIGN}"
        "\N{MACRON}"
        "\N{PILCROW SIGN}"
        "\N{SECTION SIGN}"
        "\N{CEDILLA}"
        "\N{LATIN SMALL LETTER F WITH HOOK}"
        "\N{MODIFIER LETTER CIRCUMFLEX ACCENT}"
        "\N{CARON}"
        "\N{BREVE}"
        "\N{OGONEK}"
        "\N{SMALL TILDE}"
        "\N{DAGGER}"
        "\N{DOUBLE DAGGER}"
        "\N{PER MILLE SIGN}"
        "\N{REVERSED NOT SIGN}"
        "\N{LOZENGE}"
        "\ufffd"
        "\N{FEMININE ORDINAL INDICATOR}"
        "\N{MASCULINE ORDINAL INDICATOR}"
    ),
    "currency": (
        "\N{CENT SIGN}"
        "\N{POUND SIGN}"
        "\N{YEN SIGN}"
        "\N{PESETA SIGN}"
        "\N{EURO SIGN}"
    ),
    "start_punctuation": (
        "\N{INVERTED EXCLAMATION MARK}"
        "\N{LEFT-POINTING DOUBLE ANGLE QUOTATION MARK}"
        "\N{INVERTED QUESTION MARK}"
        "\N{COPYRIGHT SIGN}"
        "\N{GREEK TONOS}"
        "\N{GREEK DIALYTIKA TONOS}"
        "\N{LEFT SINGLE QUOTATION MARK}"
        "\N{SINGLE LOW-9 QUOTATION MARK}"
        "\N{LEFT DOUBLE QUOTATION MARK}"
        "\N{DOUBLE LOW-9 QUOTATION MARK}"
        "\N{BULLET}"
        "\N{SINGLE LEFT-POINTING ANGLE QUOTATION MARK}"
        "\uf8ff"
    ),
    "end_punctuation": (
        "\N{REGISTERED SIGN}"
        "\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
        "\N{DOUBLE ACUTE ACCENT}"
        "\N{RIGHT DOUBLE QUOTATION MARK}"
        "\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}"
        "\N{TRADE MARK SIGN}"
    ),
    "numeric": (
        "\N{SUPERSCRIPT TWO}"
        "\N{SUPERSCRIPT THREE}"
        "\N{SUPERSCRIPT ONE}"
        "\N{PLUS-MINUS SIGN}"
        "\N{VULGAR FRACTION ONE QUARTER}"
        "\N{VULGAR FRACTION ONE HALF}"
        "\N{VULGAR FRACTION THREE QUARTERS}"
        "\N{MULTIPLICATION SIGN}"
        "\N{MICRO SIGN}"
        "\N{DIVISION SIGN}"
        "\N{FRACTION SLASH}"
        "\N{PARTIAL DIFFERENTIAL}"
        "\N{INCREMENT}"
        "\N{N-ARY PRODUCT}"
        "\N{N-ARY SUMMATION}"
        "\N{SQUARE ROOT}"
        "\N{INFINITY}"
        "\N{INTERSECTION}"
        "\N{INTEGRAL}"
        "\N{ALMOST EQUAL TO}"
        "\N{NOT EQUAL TO}"
        "\N{IDENTICAL TO}"
        "\N{LESS-THAN OR EQUAL TO}"
        "\N{GREATER-THAN OR EQUAL TO}"
        "\N{NUMERO SIGN}"
    ),
    # Characters commonly used to draw emoticons ("kaomoji").
    "kaomoji": (
        "Ò-Ö"
        "Ù-Ü"
        "ò-ö"
        "ø-ü"
        "\N{LATIN CAPITAL LETTER O WITH DOUBLE ACUTE}"
        "\N{DEGREE SIGN}"
    ),
    "upper_accented": (
        "\xc0-\xd1"
        "\N{LATIN CAPITAL LETTER O WITH STROKE}"
        "\N{LATIN CAPITAL LETTER U WITH DIAERESIS}"
        "\N{LATIN CAPITAL LETTER Y WITH ACUTE}"
        "\N{LATIN CAPITAL LETTER A WITH BREVE}"
        "\N{LATIN CAPITAL LETTER A WITH OGONEK}"
        "\N{LATIN CAPITAL LETTER C WITH ACUTE}"
        "\N{LATIN CAPITAL LETTER C WITH CARON}"
        "\N{LATIN CAPITAL LETTER D WITH CARON}"
        "\N{LATIN CAPITAL LETTER D WITH STROKE}"
        "\N{LATIN CAPITAL LETTER E WITH OGONEK}"
        "\N{LATIN CAPITAL LETTER E WITH CARON}"
        "\N{LATIN CAPITAL LETTER G WITH BREVE}"
        "\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}"
        "\N{LATIN CAPITAL LETTER L WITH ACUTE}"
        "\N{LATIN CAPITAL LETTER L WITH CARON}"
        "\N{LATIN CAPITAL LETTER L WITH STROKE}"
        "\N{LATIN CAPITAL LETTER N WITH ACUTE}"
        "\N{LATIN CAPITAL LETTER N WITH CARON}"
        "\N{LATIN CAPITAL LIGATURE OE}"
        "\N{LATIN CAPITAL LETTER R WITH CARON}"
        "\N{LATIN CAPITAL LETTER S WITH ACUTE}"
        "\N{LATIN CAPITAL LETTER S WITH CEDILLA}"
        "\N{LATIN CAPITAL LETTER S WITH CARON}"
        "\N{LATIN CAPITAL LETTER T WITH CEDILLA}"
        "\N{LATIN CAPITAL LETTER T WITH CARON}"
        "\N{LATIN CAPITAL LETTER U WITH RING ABOVE}"
        "\N{LATIN CAPITAL LETTER U WITH DOUBLE ACUTE}"
        "\N{LATIN CAPITAL LETTER Y WITH DIAERESIS}"
        "\N{LATIN CAPITAL LETTER Z WITH ACUTE}"
        "\N{LATIN CAPITAL LETTER Z WITH DOT ABOVE}"
        "\N{LATIN CAPITAL LETTER Z WITH CARON}"
        "\N{CYRILLIC CAPITAL LETTER GHE WITH UPTURN}"
    ),
    "lower_accented": (
        "\N{LATIN SMALL LETTER SHARP S}"
        "\xe0-\xf1"
        "\N{LATIN SMALL LETTER A WITH BREVE}"
        "\N{LATIN SMALL LETTER A WITH OGONEK}"
        "\N{LATIN SMALL LETTER C WITH ACUTE}"
        "\N{LATIN SMALL LETTER C WITH CARON}"
        "\N{LATIN SMALL LETTER D WITH CARON}"
        "\N{LATIN SMALL LETTER D WITH STROKE}"
        "\N{LATIN SMALL LETTER E WITH OGONEK}"
        "\N{LATIN SMALL LETTER E WITH CARON}"
        "\N{LATIN SMALL LETTER G WITH BREVE}"
        "\N{LATIN SMALL LETTER L WITH ACUTE}"
        "\N{LATIN SMALL LETTER L WITH CARON}"
        "\N{LATIN SMALL LETTER L WITH STROKE}"
        "\N{LATIN SMALL LIGATURE OE}"
        "\N{LATIN SMALL LETTER R WITH ACUTE}"
        "\N{LATIN SMALL LETTER S WITH ACUTE}"
        "\N{LATIN SMALL LETTER S WITH CEDILLA}"
        "\N{LATIN SMALL LETTER S WITH CARON}"
        "\N{LATIN SMALL LETTER T WITH CARON}"
        "\N{LATIN SMALL LETTER U WITH DIAERESIS}"
        "\N{LATIN SMALL LETTER Z WITH ACUTE}"
        "\N{LATIN SMALL LETTER Z WITH DOT ABOVE}"
        "\N{LATIN SMALL LETTER Z WITH CARON}"
        "\N{CYRILLIC SMALL LETTER GHE WITH UPTURN}"
        "\N{LATIN SMALL LIGATURE FI}"
        "\N{LATIN SMALL LIGATURE FL}"
    ),
    "upper_common": (
        "\N{LATIN CAPITAL LETTER THORN}"
        "\N{GREEK CAPITAL LETTER ALPHA}-\N{GREEK CAPITAL LETTER OMEGA}"
        "\N{GREEK CAPITAL LETTER ALPHA WITH TONOS}"
        "\N{GREEK CAPITAL LETTER EPSILON WITH TONOS}"
        "\N{GREEK CAPITAL LETTER ETA WITH TONOS}"
        "\N{GREEK CAPITAL LETTER IOTA WITH TONOS}"
        "\N{GREEK CAPITAL LETTER OMICRON WITH TONOS}"
        "\N{GREEK CAPITAL LETTER UPSILON WITH TONOS}"
        "\N{GREEK CAPITAL LETTER OMEGA WITH TONOS}"
        "\N{GREEK CAPITAL LETTER IOTA WITH DIALYTIKA}"
        "\N{GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA}"
        "\N{CYRILLIC CAPITAL LETTER IO}-\N{CYRILLIC CAPITAL LETTER YA}"
    ),
    "lower_common": (
        "\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER OMEGA}"
        "\N{GREEK SMALL LETTER ALPHA WITH TONOS}"
        "\N{GREEK SMALL LETTER EPSILON WITH TONOS}"
        "\N{GREEK SMALL LETTER ETA WITH TONOS}"
        "\N{GREEK SMALL LETTER IOTA WITH TONOS}"
        "\N{GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS}"
        "\N{CYRILLIC SMALL LETTER A}-\N{CYRILLIC SMALL LETTER DZHE}"
    ),
    # Box-drawing characters (classic DOS/code-page mojibake residue).
    "box": (
        "│┌┐┘├┤┬┼"
        "\N{BOX DRAWINGS DOUBLE HORIZONTAL}-\N{BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL}"
        "▀▄█▌▐░▒▓"
    ),
}
import warnings
import re
def sequence_weirdness(text):
    """Compatibility shim for the heuristic of this name from ftfy 2.x-5.x.

    Warns callers that the name is obsolete, then defers to the newer
    :func:`badness` heuristic and returns its result.
    """
    deprecation_note = (
        "`sequence_weirdness()` is an old heuristic, and the current "
        "closest equivalent is `ftfy.badness.badness()`"
    )
    warnings.warn(deprecation_note)
    return badness(text)
# Verbose regex matching character sequences that are unlikely in genuine text
# but typical of mojibake. The {category} placeholders are filled from
# MOJIBAKE_CATEGORIES via str.format. Under re.VERBOSE, whitespace and `#`
# comments OUTSIDE character classes are ignored, while spaces INSIDE [...]
# classes remain significant — hence the literal spaces in some classes below.
BADNESS_RE = re.compile(
    r"""
[{c1}]
|
[{bad}{lower_accented}{upper_accented}{box}{start_punctuation}{end_punctuation}{currency}{numeric}] [{bad}]
|
[a-zA-Z] [{lower_common}{upper_common}] [{bad}]
|
[{bad}] [{lower_accented}{upper_accented}{box}{start_punctuation}{end_punctuation}{currency}{numeric}]
|
[{lower_accented}{lower_common}{box}{end_punctuation}{currency}{numeric}] [{upper_accented}]
|
[{box}{end_punctuation}{currency}{numeric}] [{lower_accented}]
|
# leave out [upper_accented][currency] without further info, because it's used in some
# fancy leetspeak-esque writing
[{lower_accented}{box}{end_punctuation}] [{currency}]
|
\s [{upper_accented}] [{currency}]
|
[{upper_accented}{box}] [{numeric}]
|
[{lower_accented}{upper_accented}{box}{currency}{end_punctuation}] [{start_punctuation}] [{numeric}]
|
[{lower_accented}{upper_accented}{currency}{numeric}{box}] [{end_punctuation}] [{start_punctuation}]
|
[{currency}{numeric}{box}] [{start_punctuation}]
|
[a-z] [{upper_accented}] [{start_punctuation}{currency}]
|
[{box}] [{kaomoji}]
|
[{lower_accented}{upper_accented}{currency}{numeric}{start_punctuation}{end_punctuation}] [{box}]
|
[{box}] [{end_punctuation}]
|
[{lower_accented}{upper_accented}] [{end_punctuation}] \w
|
# The ligature œ when not followed by an unaccented Latin letter
[Œœ][^A-Za-z]
|
# Common Windows-1252 2-character mojibake that isn't covered by the cases above
[ÂÃÎÐ][€Šš¢£Ÿž\xa0\xad®©°·»{start_punctuation}{end_punctuation}–—´]
|
× [²³]
|
# Windows-1252 mojibake of Arabic words needs to include the 'common' characters.
# To compensate, we require four characters to be matched.
[ØÙ] [{common}{currency}{bad}{numeric}{start_punctuation}ŸŠ®°µ»]
[ØÙ] [{common}{currency}{bad}{numeric}{start_punctuation}ŸŠ®°µ»]
|
# Windows-1252 mojibake that starts 3-character sequences for some South Asian
# alphabets
à[²µ¹¼½¾]
|
# MacRoman mojibake that isn't covered by the cases above
√[±∂†≠®™´≤≥¥µø]
|
≈[°¢]
|
‚Ä[ìîïòôúùû†°¢π]
|
‚[âó][àä°ê]
|
# Windows-1251 mojibake of characters in the U+2000 range
вЂ
|
# Windows-1251 mojibake of Latin-1 characters and/or the Cyrillic alphabet.
# Because the 2-character sequences involved here may be common, we require
# seeing a 3-character sequence.
[ВГРС][{c1}{bad}{start_punctuation}{end_punctuation}{currency}°µ][ВГРС]
|
# A distinctive five-character sequence of Cyrillic letters, which can be
# Windows-1251 mojibake on top of Latin-1 mojibake of Windows-1252 characters.
# Require a Latin letter nearby.
ГўВЂВ.[A-Za-z ]
|
# Windows-1252 encodings of 'à' and 'á', as well as \xa0 itself
Ã[\xa0¡]
|
[a-z]\s?[ÃÂ][ ]
|
^[ÃÂ][ ]
|
# Cases where  precedes a character as an encoding of exactly the same
# character, and the character is common enough
[a-z.,?!{end_punctuation}] Â [ {start_punctuation}{end_punctuation}]
|
# Windows-1253 mojibake of characters in the U+2000 range
β€[™\xa0Ά\xad®°]
|
# Windows-1253 mojibake of Latin-1 characters and/or the Greek alphabet
[ΒΓΞΟ][{c1}{bad}{start_punctuation}{end_punctuation}{currency}°][ΒΓΞΟ]
""".format(
        **MOJIBAKE_CATEGORIES
    ),
    re.VERBOSE,
)
def badness(text):
    """Count the unlikely character sequences found in *text*.

    A result greater than 0 indicates that some of the text appears to be
    mojibake.
    """
    weird_sequences = BADNESS_RE.findall(text)
    return len(weird_sequences)
def is_mojibake_impacted(text):
    """Heuristically decide whether *text* is windows-1252-style mojibake.

    Returns True only when the text both contains unlikely character
    sequences AND can be re-encoded as windows-1252 — the classic mojibake
    signature. Clean text, and weird-but-not-encodable text (e.g. genuine
    CJK), returns False.
    """
    # Bug fix: `badness` is a function in this file, not a module, so the
    # original `badness.sequence_weirdness(text)` raised AttributeError.
    # The no-weirdness branch also returned True (i.e. flagged clean text
    # as mojibake); both are corrected per the standard ftfy recipe.
    if not sequence_weirdness(text):
        # Nothing weird at all -> not mojibake.
        return False
    try:
        # 'sloppy-windows-1252' is a codec registered when ftfy is imported.
        text.encode('sloppy-windows-1252')
    except UnicodeEncodeError:
        # Weird but not windows-1252-encodable: probably genuine non-Latin text.
        return False
    else:
        # Weird AND round-trips through windows-1252: flag as mojibake.
        return True
from ftfy import fix_text
# Demo: show one garbled Short description next to its ftfy-repaired version.
# (Fixes the 'Grabled' typo in the displayed label.)
print('Garbled text: \033[1m%s\033[0m\nFixed text: \033[1m%s\033[0m' % (df['Short description'][8471],
                                                                        fix_text(df['Short description'][8471])))
Grabled text: 电脑开机开ä¸å‡ºæ¥ Fixed text: 电脑开机开不出来
def is_mojibake_impacted(text):
    """Heuristically decide whether *text* is windows-1252-style mojibake.

    Returns True only when the text both contains unlikely character
    sequences AND can be re-encoded as windows-1252 — the classic mojibake
    signature. Clean text, and weird-but-not-encodable text (e.g. genuine
    CJK), returns False.
    """
    # Bug fix: `badness` is a function in this file, not a module, so the
    # original `badness.sequence_weirdness(text)` raised AttributeError.
    # The no-weirdness branch also returned True (i.e. flagged clean text
    # as mojibake); both are corrected per the standard ftfy recipe.
    if not sequence_weirdness(text):
        # Nothing weird at all -> not mojibake.
        return False
    try:
        # 'sloppy-windows-1252' is a codec registered when ftfy is imported.
        text.encode('sloppy-windows-1252')
    except UnicodeEncodeError:
        # Weird but not windows-1252-encodable: probably genuine non-Latin text.
        return False
    else:
        # Weird AND round-trips through windows-1252: flag as mojibake.
        return True
# Demo (repeat cell): garbled Short description vs. its ftfy-repaired version.
# (Fixes the 'Grabled' typo in the displayed label.)
print('Garbled text: \033[1m%s\033[0m\nFixed text: \033[1m%s\033[0m' % (df['Short description'][8471],
                                                                        fix_text(df['Short description'][8471])))
Grabled text: 电脑开机开ä¸å‡ºæ¥ Fixed text: 电脑开机开不出来
# Persist the mojibake-treated dataframe: CSV with a UTF-8 BOM (Excel-friendly)
# plus a pickle snapshot for fast reload.
df.to_csv('mojibake_treated.csv', index=False, encoding='utf_8_sig')
with open('mojibake_treated.pkl', 'wb') as handle:
    pickle.dump(df, handle, protocol=pickle.HIGHEST_PROTOCOL)
Comments:
Language Translation (Goslate: Free Google Translate API)
Goslate is an open-source Python library that implements the Google Translate API. It uses the Google Translate Ajax API to make calls to methods such as detect and translate. It was chosen over the Googletrans library because Goslate is built to bypass the ticketing mechanism that blocks simple crawler programs from accessing the Ajax API. Hence Goslate, with multiple service URLs, is able to translate the entire dataset in very few iterations without the user's IP address being blocked.
# Candidate Google Translate endpoints across regional domains, so requests
# can rotate hosts and reduce the chance of per-host rate limiting/IP blocks.
svc_domains = ['.com','.com.au','.com.ar','.co.kr','.co.in','.co.jp','.at','.de','.ru','.ch','.fr','.es','.ae']
svc_urls = ['http://translate.google%s' % domain for domain in svc_domains]
print('Original text: \033[1m%s\033[0m\nFixed text: \033[1m%s\033[0m' % ('电脑开机开不出来', 'Boot the computer does not really come out'))
Original text: 电脑开机开不出来 Fixed text: Boot the computer does not really come out
# Serialize the translated dataset (UTF-8 BOM CSV for Excel, plus a pickle).
df.to_csv('translated_ticket.csv', index=False, encoding='utf_8_sig')
with open('translated_ticket.pkl','wb') as f:
    pickle.dump(df, f, pickle.HIGHEST_PROTOCOL)
# Reload from the pickle so work can resume if the translation IP gets blocked.
with open('translated_ticket.pkl','rb') as f:
    df = pickle.load(f)
Text Preprocessing
Text preprocessing is the process of transforming text from human language into a machine-readable format for further processing. After a text is obtained, we start with text normalization. Text normalization includes:
# Regex patterns used by the cleaning helpers below.
EMAIL_PATTERN = r"([\w.+-]+@[a-z\d-]+\.[a-z\d.-]+)"  # bare e-mail addresses (text is lowercased first)
# NOTE(review): PUNCT_PATTERN is unused — punctuation is stripped via str.translate instead.
PUNCT_PATTERN = r"[,|@|\|?|\\|$&*|%|\r|\n|.:|\s+|/|//|\\|/|\||-|<|>|;|(|)|=|+|#|-|\"|[-\]]|{|}]"
NUMER_PATTERN = r"(?<!RetainedEmailId)(\d+(?:\.\d+)?)"  # numeric tokens (look-behind now inert; see extract_email)
# Define a function to treat the texts
def cleanseText(text):
    """Lowercase *text* and strip numbers, punctuation and excess whitespace,
    preserving any e-mail addresses verbatim.

    E-mails are swapped for punctuation-free, digit-free placeholders before
    cleaning and restored afterwards, so the '@' and '.' they contain survive.
    """
    text = str(text).lower()
    email_dict = extract_email(text)
    # Hide e-mails behind placeholders so the cleaning steps cannot mangle them.
    for placeholder, email in email_dict.items():
        text = text.replace(email, placeholder)
    text = re.sub(NUMER_PATTERN, '', text)                             # drop numeric tokens
    text = text.translate(str.maketrans("", "", string.punctuation))   # drop all punctuation (incl. backticks)
    text = re.sub(r'\s+', ' ', text)                                   # collapse whitespace runs to single spaces
    # (The original also replaced backticks with apostrophes afterwards — dead
    # code, since str.translate already removed every backtick.)
    # Restore the original e-mail addresses.
    for placeholder, email in email_dict.items():
        text = text.replace(placeholder, email)
    return text.strip()
def extract_email(text):
    """Return a dict mapping a unique placeholder to each e-mail found in *text*.

    Bug fix: placeholder indices are encoded as letters (a=0 ... j=9) instead
    of digits. The original digit suffix broke for the 10th e-mail onwards,
    because NUMER_PATTERN's look-behind only shields the first digit of
    'RetainedEmailId12', leaving a mangled placeholder that never gets
    restored. E-mails are also sorted so placeholder assignment is
    deterministic (a set's iteration order is not).
    """
    unique_emailid = sorted(set(re.findall(EMAIL_PATTERN, text)))
    email_replacement = dict()
    for idx, email in enumerate(unique_emailid):
        tag = ''.join(chr(ord('a') + int(digit)) for digit in str(idx))
        email_replacement[f'RetainedEmailId{tag}'] = email
    return email_replacement
# Sanity check: show one raw Description next to its cleaned form.
print('\033[1mOriginal text:\033[0m')
print(df['Description'][32])
print('_'*100)
print('\033[1mCleaned text:\033[0m')
print(cleanseText(df['Description'][32]))
Original text: received from: kxsceyzo.naokumlb@gmail.com gentles, i have two devices that are trying to share an ip address. they are trying to share 96.26.27.9619. one is a printer with the hostname of prtjc0074, and the other is a new display for erp. the display is using dhcp to get its address assigned and the printer is hard coded. my guess is that the address 96.26.27.9619 did not get set to a static address in dhcp. i need this corrected so the display will pick up another address. ____________________________________________________________________________________________________ Cleaned text: received from kxsceyzo.naokumlb@gmail.com gentles i have two devices that are trying to share an ip address they are trying to share one is a printer with the hostname of prtjc and the other is a new display for erp the display is using dhcp to get its address assigned and the printer is hard coded my guess is that the address did not get set to a static address in dhcp i need this corrected so the display will pick up another address
# Apply the cleaning function to both free-text columns of the dataset.
df['Description'] = df['Description'].apply(cleanseText)
df['Short description'] = df['Short description'].apply(cleanseText)
# Verify the data
df.tail()
| Short description | Description | Caller | Assignment group | |
|---|---|---|---|---|
| 8495 | emails not coming in from zz mail | xd xd received from avglmrts.vhqmtiua@gmail.co... | avglmrts vhqmtiua | GRP_29 |
| 8496 | telephonysoftware issue | telephonysoftware issue | rbozivdq gmlhrtvp | GRP_0 |
| 8497 | vip windows password reset for tifpdchb pedxruyf | vip windows password reset for tifpdchb pedxruyf | oybwdsgx oxyhwrfz | GRP_0 |
| 8498 | machine nã£o estã¡ funcionando | i am unable to access the machine utilities to... | ufawcgob aowhxjky | GRP_62 |
| 8499 | an mehreren pcs lassen sich verschiedene prgra... | an mehreren pcs lassen sich verschiedene prgra... | kqvbrspl jyzoklfx | GRP_49 |
Stemming and Lemmatization
!pip install spacy
Requirement already satisfied: spacy in /usr/local/lib/python3.7/dist-packages (2.2.4) Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (2.23.0) Requirement already satisfied: srsly<1.1.0,>=1.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.0.5) Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (4.64.0) Requirement already satisfied: blis<0.5.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (0.4.1) Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.0.0) Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.21.6) Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (0.9.1) Requirement already satisfied: thinc==7.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (7.4.0) Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy) (3.0.6) Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy) (2.0.6) Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from spacy) (57.4.0) Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.0.7) Requirement already satisfied: plac<1.2.0,>=0.9.6 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.1.3) Requirement already satisfied: importlib-metadata>=0.20 in /usr/local/lib/python3.7/dist-packages (from catalogue<1.1.0,>=0.0.7->spacy) (4.11.3) Requirement already satisfied: typing-extensions>=3.6.4 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy) (4.2.0) Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from 
importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy) (3.8.0) Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (2021.10.8) Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (2.10) Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (3.0.4) Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (1.24.3)
import spacy
import en_core_web_sm
# Load spaCy's small English model; parser and NER are disabled since only the
# tagger (needed for lemmatization) is used. (Original comment said "medium"
# model, but 'en_core_web_sm' is the small one.)
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# Define a function to lemmatize the descriptions
def lemmatizer(sentence):
    """Return *sentence* rebuilt from spaCy lemmas, dropping pronoun markers.

    spaCy 2.x lemmatizes every pronoun to the sentinel '-PRON-', which
    carries no content and is therefore filtered out.
    """
    lemmas = [token.lemma_ for token in nlp(sentence) if token.lemma_ != '-PRON-']
    return " ".join(lemmas)
# Take an example of row# 43 Description and lemmatize it
print('\033[1mOriginal text:\033[0m')  # ANSI escape codes render the heading in bold
print(df['Description'][43])
print('_'*100)  # visual separator between the two versions
print('\033[1mLemmatized text:\033[0m')
print(lemmatizer(df['Description'][43]))
Original text: received from yisohglr.uvteflgb@gmail.com hi the printer printer is not working and needs a part replaced can you reroute the jobs in queue to printer printer wihuyjdo qpogfwkb has indicated that prqos needs a new part and it may not deliver for a few days so the inwarehousetools will need to print on printer for now this needs to be taken care of today since the inwarehousetools are printed and are picked up by an outside vendor at pm in usa on a daily basis please contact dkmcfreg anwmfvlgenkataramdntyana if you have questions about the jobs in queue for today ____________________________________________________________________________________________________ Lemmatized text: receive from yisohglr.uvteflgb@gmail.com hi the printer printer be not work and need a part replace can reroute the job in queue to printer printer wihuyjdo qpogfwkb have indicate that prqos need a new part and may not deliver for a few day so the inwarehousetool will need to print on printer for now this need to be take care of today since the inwarehousetool be print and be pick up by an outside vendor at pm in usa on a daily basis please contact dkmcfreg anwmfvlgenkataramdntyana if have question about the job in queue for today
# Apply the Lemmatization to entire dataset
# NOTE(review): a row-wise apply runs the spaCy pipeline once per document;
# nlp.pipe() would batch this and be noticeably faster — left as-is here.
df['Description'] = df['Description'].apply(lemmatizer)
df['Short description'] = df['Short description'].apply(lemmatizer)
# Verify the data
df.tail()
| Short description | Description | Caller | Assignment group | |
|---|---|---|---|---|
| 8495 | email not come in from zz mail | xd xd receive from avglmrts.vhqmtiua@gmail.com... | avglmrts vhqmtiua | GRP_29 |
| 8496 | telephonysoftware issue | telephonysoftware issue | rbozivdq gmlhrtvp | GRP_0 |
| 8497 | vip windows password reset for tifpdchb pedxruyf | vip windows password reset for tifpdchb pedxruyf | oybwdsgx oxyhwrfz | GRP_0 |
| 8498 | machine nã£o estã ¡ funcionando | i be unable to access the machine utility to f... | ufawcgob aowhxjky | GRP_62 |
| 8499 | an mehreren pcs lassen sich verschiedene prgra... | an mehreren pcs lassen sich verschiedene prgra... | kqvbrspl jyzoklfx | GRP_49 |
# Serialize the preprocessed dataset
# utf_8_sig writes a BOM so spreadsheet tools detect the encoding correctly
df.to_csv('preprocessed_ticket.csv', index=False, encoding='utf_8_sig')
with open('preprocessed_ticket.pkl','wb') as f:
    pickle.dump(df, f, pickle.HIGHEST_PROTOCOL)
# Create new features of length and word count for both of the description columns
# (inserted at fixed positions so each metric sits next to its source column)
df.insert(1, 'sd_len', df['Short description'].astype(str).apply(len))
df.insert(2, 'sd_word_count', df['Short description'].apply(lambda x: len(str(x).split())))
df.insert(4, 'desc_len', df['Description'].astype(str).apply(len))
df.insert(5, 'desc_word_count', df['Description'].apply(lambda x: len(str(x).split())))
df.head()
| Short description | sd_len | sd_word_count | Description | desc_len | desc_word_count | Caller | Assignment group | |
|---|---|---|---|---|---|---|---|---|
| 0 | login issue | 11 | 2 | verify user detailsemployee manager namexd che... | 183 | 31 | spxjnwir pjlcoqds | GRP_0 |
| 1 | outlook | 7 | 1 | xd xd receive from hmjdrvpb.komuaywn@gmail.com... | 184 | 28 | hmjdrvpb komuaywn | GRP_0 |
| 2 | can not log in to vpn | 21 | 6 | xd xd receive from eylqgodm.ybqkwiam@gmail.com... | 93 | 17 | eylqgodm ybqkwiam | GRP_0 |
| 3 | unable to access hrtool page | 28 | 5 | unable to access hrtool page | 28 | 5 | xbkucsvz gcpydteq | GRP_0 |
| 4 | skype error | 11 | 2 | skype error | 11 | 2 | owlgqjme qhcozdfx | GRP_0 |
Exploratory Data Analysis
Exploratory Data Analysis (EDA) is an approach/philosophy for data analysis that employs a variety of techniques (mostly graphical) to maximize insight into a data set, uncover its underlying structure, extract important variables, and detect outliers and anomalies.
Visually representing the content of a text document is one of the most important tasks in the field of text mining. It helps not only to explore the content of documents from different aspects and at different levels of details, but also helps in summarizing a single document, show the words and topics, detect events, and create storylines.
We'll be using plotly library to generate the graphs and visualizations. We need cufflinks to link plotly to pandas dataframe and add the iplot method
!pip install plotly cufflinks
Requirement already satisfied: plotly in /usr/local/lib/python3.7/dist-packages (5.5.0) Requirement already satisfied: cufflinks in /usr/local/lib/python3.7/dist-packages (0.17.3) Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from plotly) (1.15.0) Requirement already satisfied: tenacity>=6.2.0 in /usr/local/lib/python3.7/dist-packages (from plotly) (8.0.1) Requirement already satisfied: ipywidgets>=7.0.0 in /usr/local/lib/python3.7/dist-packages (from cufflinks) (7.7.0) Requirement already satisfied: numpy>=1.9.2 in /usr/local/lib/python3.7/dist-packages (from cufflinks) (1.21.6) Requirement already satisfied: colorlover>=0.2.1 in /usr/local/lib/python3.7/dist-packages (from cufflinks) (0.3.0) Requirement already satisfied: ipython>=5.3.0 in /usr/local/lib/python3.7/dist-packages (from cufflinks) (5.5.0) Requirement already satisfied: pandas>=0.19.2 in /usr/local/lib/python3.7/dist-packages (from cufflinks) (1.3.5) Requirement already satisfied: setuptools>=34.4.1 in /usr/local/lib/python3.7/dist-packages (from cufflinks) (57.4.0) Requirement already satisfied: simplegeneric>0.8 in /usr/local/lib/python3.7/dist-packages (from ipython>=5.3.0->cufflinks) (0.8.1) Requirement already satisfied: pexpect in /usr/local/lib/python3.7/dist-packages (from ipython>=5.3.0->cufflinks) (4.8.0) Requirement already satisfied: pickleshare in /usr/local/lib/python3.7/dist-packages (from ipython>=5.3.0->cufflinks) (0.7.5) Requirement already satisfied: decorator in /usr/local/lib/python3.7/dist-packages (from ipython>=5.3.0->cufflinks) (4.4.2) Requirement already satisfied: traitlets>=4.2 in /usr/local/lib/python3.7/dist-packages (from ipython>=5.3.0->cufflinks) (5.1.1) Requirement already satisfied: prompt-toolkit<2.0.0,>=1.0.4 in /usr/local/lib/python3.7/dist-packages (from ipython>=5.3.0->cufflinks) (1.0.18) Requirement already satisfied: pygments in /usr/local/lib/python3.7/dist-packages (from ipython>=5.3.0->cufflinks) (2.6.1) Requirement already 
satisfied: nbformat>=4.2.0 in /usr/local/lib/python3.7/dist-packages (from ipywidgets>=7.0.0->cufflinks) (5.3.0) Requirement already satisfied: ipython-genutils~=0.2.0 in /usr/local/lib/python3.7/dist-packages (from ipywidgets>=7.0.0->cufflinks) (0.2.0) Requirement already satisfied: ipykernel>=4.5.1 in /usr/local/lib/python3.7/dist-packages (from ipywidgets>=7.0.0->cufflinks) (4.10.1) Requirement already satisfied: jupyterlab-widgets>=1.0.0 in /usr/local/lib/python3.7/dist-packages (from ipywidgets>=7.0.0->cufflinks) (1.1.0) Requirement already satisfied: widgetsnbextension~=3.6.0 in /usr/local/lib/python3.7/dist-packages (from ipywidgets>=7.0.0->cufflinks) (3.6.0) Requirement already satisfied: tornado>=4.0 in /usr/local/lib/python3.7/dist-packages (from ipykernel>=4.5.1->ipywidgets>=7.0.0->cufflinks) (5.1.1) Requirement already satisfied: jupyter-client in /usr/local/lib/python3.7/dist-packages (from ipykernel>=4.5.1->ipywidgets>=7.0.0->cufflinks) (5.3.5) Requirement already satisfied: fastjsonschema in /usr/local/lib/python3.7/dist-packages (from nbformat>=4.2.0->ipywidgets>=7.0.0->cufflinks) (2.15.3) Requirement already satisfied: jupyter-core in /usr/local/lib/python3.7/dist-packages (from nbformat>=4.2.0->ipywidgets>=7.0.0->cufflinks) (4.10.0) Requirement already satisfied: jsonschema>=2.6 in /usr/local/lib/python3.7/dist-packages (from nbformat>=4.2.0->ipywidgets>=7.0.0->cufflinks) (4.3.3) Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from jsonschema>=2.6->nbformat>=4.2.0->ipywidgets>=7.0.0->cufflinks) (4.2.0) Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from jsonschema>=2.6->nbformat>=4.2.0->ipywidgets>=7.0.0->cufflinks) (4.11.3) Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema>=2.6->nbformat>=4.2.0->ipywidgets>=7.0.0->cufflinks) (0.18.1) Requirement already satisfied: 
importlib-resources>=1.4.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema>=2.6->nbformat>=4.2.0->ipywidgets>=7.0.0->cufflinks) (5.7.1) Requirement already satisfied: attrs>=17.4.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema>=2.6->nbformat>=4.2.0->ipywidgets>=7.0.0->cufflinks) (21.4.0) Requirement already satisfied: zipp>=3.1.0 in /usr/local/lib/python3.7/dist-packages (from importlib-resources>=1.4.0->jsonschema>=2.6->nbformat>=4.2.0->ipywidgets>=7.0.0->cufflinks) (3.8.0) Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=0.19.2->cufflinks) (2022.1) Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=0.19.2->cufflinks) (2.8.2) Requirement already satisfied: wcwidth in /usr/local/lib/python3.7/dist-packages (from prompt-toolkit<2.0.0,>=1.0.4->ipython>=5.3.0->cufflinks) (0.2.5) Requirement already satisfied: notebook>=4.4.1 in /usr/local/lib/python3.7/dist-packages (from widgetsnbextension~=3.6.0->ipywidgets>=7.0.0->cufflinks) (5.3.1) Requirement already satisfied: jinja2 in /usr/local/lib/python3.7/dist-packages (from notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.0.0->cufflinks) (2.11.3) Requirement already satisfied: Send2Trash in /usr/local/lib/python3.7/dist-packages (from notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.0.0->cufflinks) (1.8.0) Requirement already satisfied: nbconvert in /usr/local/lib/python3.7/dist-packages (from notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.0.0->cufflinks) (5.6.1) Requirement already satisfied: terminado>=0.8.1 in /usr/local/lib/python3.7/dist-packages (from notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.0.0->cufflinks) (0.13.3) Requirement already satisfied: pyzmq>=13 in /usr/local/lib/python3.7/dist-packages (from jupyter-client->ipykernel>=4.5.1->ipywidgets>=7.0.0->cufflinks) (22.3.0) Requirement already satisfied: ptyprocess in 
/usr/local/lib/python3.7/dist-packages (from terminado>=0.8.1->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.0.0->cufflinks) (0.7.0) Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from jinja2->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.0.0->cufflinks) (2.0.1) Requirement already satisfied: mistune<2,>=0.8.1 in /usr/local/lib/python3.7/dist-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.0.0->cufflinks) (0.8.4) Requirement already satisfied: defusedxml in /usr/local/lib/python3.7/dist-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.0.0->cufflinks) (0.7.1) Requirement already satisfied: pandocfilters>=1.4.1 in /usr/local/lib/python3.7/dist-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.0.0->cufflinks) (1.5.0) Requirement already satisfied: entrypoints>=0.2.2 in /usr/local/lib/python3.7/dist-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.0.0->cufflinks) (0.4) Requirement already satisfied: testpath in /usr/local/lib/python3.7/dist-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.0.0->cufflinks) (0.6.0) Requirement already satisfied: bleach in /usr/local/lib/python3.7/dist-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.0.0->cufflinks) (5.0.0) Requirement already satisfied: webencodings in /usr/local/lib/python3.7/dist-packages (from bleach->nbconvert->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.0.0->cufflinks) (0.5.1)
# Confirm which plotting-library versions this runtime resolved
print('Plotly:', py.__version__)
print('Cufflinks:', cf.__version__)
Plotly: 5.5.0 Cufflinks: 0.17.3
Univariate visualization
Single-variable or univariate visualization is the simplest type of visualization which consists of observations on only a single characteristic or attribute. Univariate visualization includes histogram, bar plots and line charts.
import plotly.io as pio
# Render plotly figures inline in the Colab notebook
pio.renderers.default = "colab"
# Assignment group distribution
print('\033[1mTotal assignment groups:\033[0m', df['Assignment group'].nunique())
# Histogram of ticket counts per assignment group
df['Assignment group'].iplot(
    kind='hist',
    xTitle='Assignment Group',
    yTitle='count',
    title='Assignment Group Distribution- Histogram (Fig-1)')
Total assignment groups: 74
# Pie chart of ticket counts per assignment group
assgn_grp = pd.DataFrame(df.groupby('Assignment group').size(),columns = ['Count']).reset_index()
assgn_grp.iplot(
    kind='pie',
    labels='Assignment group',
    values='Count',
    title='Assignment Group Distribution- Pie Chart (Fig-2)',
    hoverinfo="label+percent+name", hole=0.25)
# Bar plot: one bar per record ('-' prefix reverses the named colorscale)
df['Assignment group'].iplot(
    kind='bar',
    yTitle='Assignment Group',
    xTitle='Record #',
    colorscale='-plotly',
    title='Assignment Group Distribution- Bar Chart (Fig-3)')
# Find out the Assignment Groups with less than equal to 30 tickets assigned
# (groupby().filter keeps the original rows of every qualifying group)
rare_ticket = df.groupby(['Assignment group']).filter(lambda x: len(x) <= 30)
print('\033[1m#Groups with less than equal to 30 tickets assigned:\033[0m', rare_ticket['Assignment group'].nunique())
rare_ticket['Assignment group'].iplot(
    kind='hist',
    xTitle='Assignment Group',
    yTitle='count',
    colorscale='-orrd',
    title='#Records by rare Assignment Groups- Histogram (Fig-4)')
#Groups with less than equal to 30 tickets assigned: 40
# Distribution of Assignment groups excluding GRP_0 & rare groups (groups with less than equal 30 tickets assigned)
excluded_grp = ['GRP_0']
excluded_grp.extend(rare_ticket['Assignment group'].unique())
filtered_tkt = df[~df['Assignment group'].isin(excluded_grp)]
# Pie chart; pull= offsets each slice progressively so labels stay readable
filtered_assgn_grp = pd.DataFrame(filtered_tkt.groupby('Assignment group').size(),columns = ['Count']).reset_index()
filtered_assgn_grp.iplot(
    kind='pie',
    labels='Assignment group',
    values='Count',
    title='#Records by Assignment groups(excluding GRP_0 and rare groups)- Pie Chart (Fig-5)',
    pull=np.linspace(0,0.3,filtered_assgn_grp['Assignment group'].nunique()))
# Histogram over the same filtered ticket set
filtered_tkt['Assignment group'].iplot(
    kind='histogram',
    xTitle='Assignment Group',
    yTitle='count',
    colorscale='-gnbu',
    title='#Records by Assignment groups(excluding GRP_0 and rare groups)- Histogram (Fig-6)')
Comments:
The distribution of Callers
# Find out top 10 callers in terms of frequency of raising tickets in the entire dataset
print('\033[1mTotal caller count:\033[0m', df['Caller'].nunique())
df1 = pd.DataFrame(df.groupby(['Caller']).size().nlargest(10), columns=['Count']).reset_index()
# pull= offsets the smaller slices so their labels stay readable
df1.iplot(kind='pie',
          labels='Caller',
          values='Count',
          title='Top 10 caller- Pie Chart (Fig-7)',
          colorscale='-spectral',
          pull=[0,0,0,0,0.05,0.1,0.15,0.2,0.25,0.3])
Total caller count: 2950
# Top 5 callers in each assignment group
top_n = 5
# value_counts() within each group. NOTE: the resulting Series is named
# 'Caller' but its VALUES are ticket counts — the caller name lives in the
# second level of the (Assignment group, Caller) MultiIndex.
s = df['Caller'].groupby(df['Assignment group']).value_counts()
caller_grp = pd.DataFrame(s.groupby(level=0).nlargest(top_n).reset_index(level=0, drop=True))
caller_grp.head(15)
| Caller | ||
|---|---|---|
| Assignment group | Caller | |
| GRP_0 | fumkcsji sarmtlhy | 132 |
| rbozivdq gmlhrtvp | 86 | |
| olckhmvx pcqobjnd | 54 | |
| efbwiadp dicafxhv | 45 | |
| mfeyouli ndobtzpw | 13 | |
| GRP_1 | bpctwhsn kzqsbmtp | 6 |
| jloygrwh acvztedi | 4 | |
| jyoqwxhz clhxsoqy | 3 | |
| spxqmiry zpwgoqju | 3 | |
| kbnfxpsy gehxzayq | 2 | |
| GRP_10 | bpctwhsn kzqsbmtp | 60 |
| ihfkwzjd erbxoyqk | 6 | |
| dizquolf hlykecxa | 5 | |
| gnasmtvx cwxtsvkm | 3 | |
| hlrmufzx qcdzierm | 3 |
# Visualize Top 5 callers in each of top 10 assignment groups
top_n = 10
top_grps = assgn_grp.nlargest(top_n, 'Count')['Assignment group'].tolist()
# Lay out the 10 pies on a 2x5 matplotlib grid
fig_cols = 5
fig_rows = int(np.ceil(top_n/fig_cols))
fig, axes = plt.subplots(fig_rows, fig_cols, figsize=(13,9.5))
fig.suptitle('Top 5 callers in each of top 10 assignment groups- Pie Chart (Fig-8)', y=1, va= 'bottom', size='20')
for row in range(fig_rows):
    for col in range(fig_cols):
        grp_n = fig_cols * row + col  # flat index of this subplot
        if grp_n < top_n:
            # Cross-section of caller_grp for this group: top-5 caller counts
            xs = caller_grp.xs(top_grps[grp_n])
            _ = axes[row,col].pie(xs, autopct='%1.1f%%', explode=[0.05]*5)
            axes[row,col].legend(labels=xs.index,loc="best")
            axes[row,col].axis('equal')  # keep the pie circular
            axes[row,col].set_title(top_grps[grp_n])
plt.tight_layout()
# Check if any caller appears to raise tickets for multiple groups.
# BUGFIX: caller_grp's 'Caller' COLUMN holds ticket counts (the caller name is
# the second level of the MultiIndex), so the original
#   caller_grp[caller_grp.Caller.duplicated()]
#   mul_caller.index[mul_caller.Caller.unique()]
# flagged duplicate *counts* and then indexed the MultiIndex positionally by
# those count values — producing an essentially arbitrary caller list.
# Detect duplicates on the caller level of the index instead.
caller_names = caller_grp.index.get_level_values(1)
mul_caller = caller_grp[caller_names.duplicated()]
uni_mul_caller = list(mul_caller.index.get_level_values(1).unique())
print(f'\033[1mFollowing {len(uni_mul_caller)} callers happen to raise tickets for multiple groups:\033[0m\n')
print(uni_mul_caller)
mul_caller
Following 15 callers happen to raise tickets for multiple groups:
['hlrmufzx qcdzierm', 'fbgetczn jlsvxura', 'gnasmtvx cwxtsvkm', 'ihfkwzjd erbxoyqk', 'tqfnalpj qyoscnge', 'fmqubnvs kcxpeyiv', 'tghrloks jbgcvlmf', 'jwqyxbzs adpvilqu', 'nuhfwplj ojcwxser', 'oldrctiu bxurpsyi', 'vlymsnej whlqxcst', 'dkmcfreg anwmfvlg', 'bpctwhsn kzqsbmtp', 'spxqmiry zpwgoqju', 'obanjrhg rnafleys']
| Caller | ||
|---|---|---|
| Assignment group | Caller | |
| GRP_1 | spxqmiry zpwgoqju | 3 |
| GRP_10 | ihfkwzjd erbxoyqk | 6 |
| gnasmtvx cwxtsvkm | 3 | |
| hlrmufzx qcdzierm | 3 | |
| GRP_11 | tghrloks jbgcvlmf | 2 |
| ... | ... | ... |
| GRP_73 | kcnosyae zlpmfxgs | 1 |
| GRP_8 | ZkBogxib QsEJzdZO | 54 |
| GRP_9 | ctzykflo evzbhgru | 3 |
| sholvcmf bjtpomrl | 3 | |
| urhpnlaf agmsfqil | 3 |
281 rows × 1 columns
Comments:
The distribution of Short description lengths
# Serialize the preprocessed dataset (checkpoint before the length analysis)
df.to_csv('preprocessed_ticket.csv', index=False, encoding='utf_8_sig')
with open('preprocessed_ticket.pkl','wb') as f:
    pickle.dump(df, f, pickle.HIGHEST_PROTOCOL)
# Short Desc text length, one point per record
df['sd_len'].iplot(
    kind='scatter',
    xTitle='text length',
    yTitle='count',
    title='Short Desc. Text Length Distribution (Fig-9)')
# Short desc word count, binned distribution
df['sd_word_count'].iplot(
    kind='hist',
    bins=100,
    xTitle='word count',
    linecolor='black',
    yTitle='count',
    colorscale='pastel1',
    title='Short desc. Word Count Distribution (Fig-10)')
The distribution of Description lengths
# Description text length
# NOTE(review): kind='bar' draws one bar per record here; a histogram (as in
# Fig-10) may have been intended — confirm before reuse.
df['desc_len'].iplot(
    kind='bar',
    xTitle='text length',
    yTitle='count',
    colorscale='-ylgn',
    title='Description Text Length Distribution (Fig-11)')
# Description word count
df['desc_word_count'].iplot(
    kind='bar',
    xTitle='word count',
    linecolor='black',
    yTitle='count',
    colorscale='-bupu',
    title='Description Word Count Distribution (Fig-12)')
# Combine both free-text columns into a single 'Summary' feature
df.insert(loc=8,
          column='Summary',
          allow_duplicates=True,
          value=list(df['Short description'].str.strip() + ' ' + df['Description'].str.strip()))
# Domain/boilerplate words (greetings, sign-offs) added to wordcloud's defaults
STOP_WORDS = STOPWORDS.union({'yes','na','hi',
                              'receive','hello',
                              'regards','thanks',
                              'from','greeting',
                              'forward','reply',
                              'will','please',
                              'see','help','able'})
def get_top_n_ngrams(corpus, top_n=None, ngram_range=(1,1), stopwords=None):
    """Return the most frequent n-grams in *corpus* as (ngram, count) pairs,
    sorted by descending count.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count n-grams over.
    top_n : int or None
        How many top n-grams to return; None returns all of them.
    ngram_range : tuple(int, int)
        (min_n, max_n), passed through to CountVectorizer.
    stopwords : collection of str or None
        Stop words to drop before counting.
    """
    # fit_transform builds the vocabulary and counts in a single pass
    # (the original called fit() and then transform(), scanning the corpus twice).
    vec = CountVectorizer(ngram_range=ngram_range, stop_words=stopwords)
    bag_of_words = vec.fit_transform(corpus)
    # Column-wise totals give each n-gram's corpus-wide frequency.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda x: x[1], reverse=True)
    return words_freq[:top_n]
# Top 50 Unigrams before removing stop words
top_n = 50
ngram_range = (1,1)
uni_grams = get_top_n_ngrams(df.Summary, top_n, ngram_range)
df1 = pd.DataFrame(uni_grams, columns = ['Summary' , 'count'])
df1.groupby('Summary').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar',
    yTitle='Count',
    linecolor='black',
    colorscale='piyg',
    title=f'Top {top_n} Unigrams in Summary')
# Top 50 Unigrams after removing stop words (same chart for comparison)
uni_grams_sw = get_top_n_ngrams(df.Summary, top_n, ngram_range, stopwords=STOP_WORDS)
df1 = pd.DataFrame(uni_grams_sw, columns = ['Summary' , 'count'])
df1.groupby('Summary').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar',
    yTitle='Count',
    linecolor='black',
    colorscale='-piyg',
    title=f'Top {top_n} Unigrams in Summary without stop words')
def generate_word_clod(corpus):
    """Render and display a word cloud of *corpus* (a single string),
    filtering out the shared STOP_WORDS set."""
    cloud = WordCloud(width = 800, height = 800,
                      background_color ='white',
                      stopwords=STOP_WORDS,
                      min_font_size = 10)
    image = cloud.generate(corpus)
    plt.figure(figsize = (12, 12), facecolor = None)
    plt.imshow(image)
    plt.axis("off")
    plt.tight_layout(pad = 0)
    plt.show()
# Word clouds for selected assignment groups. A loop replaces the ten
# near-identical copy-pasted calls the notebook originally had; the order of
# groups is unchanged, so the figures render in the same sequence.
for grp in ['GRP_0', 'GRP_8', 'GRP_24', 'GRP_12', 'GRP_9',
            'GRP_2', 'GRP_19', 'GRP_3', 'GRP_6', 'GRP_13']:
    # Word Cloud for all tickets assigned to this group
    generate_word_clod(' '.join(df[df['Assignment group'] == grp].Summary.str.strip()))
# Generate wordcloud for ticket Short description
generate_word_clod(' '.join(df['Short description'].str.strip()))
# Generate wordcloud for ticket Description
generate_word_clod(' '.join(df.Description.str.strip()))
# Generate wordcloud for ticket Summary
generate_word_clod(' '.join(df.Summary.str.strip()))
# Serialize the dataset after EDA so the modeling stage can restart from here
with open('model_ready.pkl','wb') as f:
    pickle.dump(df, f, pickle.HIGHEST_PROTOCOL)
Comments:
job fail (897 times)
Analysis on GRP_0 which is the most frequent group to assign a ticket to reveals that this group deals with mostly the maintenance problems such as password reset, account lock, login issue, ticket update etc.
Model Building
Let's proceed towards trying different model architectures mentioned below to classify the problem and validate which one is outperforming.
Let's create another column of categorical datatype from Assignment groups. Let's write some generic methods for utilities and to plot evaluation metrics.
# Create a target categorical column: integer codes (int8) for each
# Assignment group, used as the classification label
df['target'] = df['Assignment group'].astype('category').cat.codes
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8500 entries, 0 to 8499 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Short description 8500 non-null object 1 sd_len 8500 non-null int64 2 sd_word_count 8500 non-null int64 3 Description 8500 non-null object 4 desc_len 8500 non-null int64 5 desc_word_count 8500 non-null int64 6 Caller 8500 non-null object 7 Assignment group 8500 non-null object 8 Summary 8500 non-null object 9 target 8500 non-null int8 dtypes: int64(4), int8(1), object(5) memory usage: 606.1+ KB
# A class that logs the time
class Timer():
    '''
    A generic class to log elapsed wall-clock time between start() and stop().
    '''
    def __init__(self):
        # Reference timestamp set by start(); None until start() is called.
        self.start_ts = None
    def start(self):
        '''Record the current time as the reference point.'''
        self.start_ts = time()
    def stop(self):
        '''Return a human-readable elapsed-time string since start().

        Raises RuntimeError if start() was never called (the original code
        raised an opaque TypeError from None arithmetic instead).
        '''
        if self.start_ts is None:
            raise RuntimeError('Timer.stop() called before start()')
        # BUGFIX: the original format '%2f' means "minimum field width 2"
        # and printed six decimals; '%.2f' (two decimal places) was intended.
        return 'Time taken: %.2fs' % (time()-self.start_ts)
timer = Timer()
def plot_prec_recall_vs_thresh(precisions, recalls, thresholds):
    """Plot precision and recall as functions of the decision threshold.

    The last element of each metric array is dropped because sklearn's
    precision_recall_curve returns one more metric value than thresholds.
    """
    plt.figure(figsize=(10,5))
    for series, fmt, name in ((precisions, 'b--', 'precision'),
                              (recalls, 'g--', 'recall')):
        plt.plot(thresholds, series[:-1], fmt, label=name)
    plt.xlabel('Threshold')
    plt.legend()
def run_classification(estimator, X_train, X_test, y_train, y_test, arch_name=None, pipelineRequired=True, isDeepModel=False):
    """Train *estimator* and print train/test accuracy, the confusion matrix
    and the classification report, plus elapsed time.

    Parameters
    ----------
    estimator : sklearn estimator or compiled Keras model
    X_train, X_test : features (raw text for the pipeline path)
    y_train, y_test : integer class labels
    arch_name : str or None — checkpoint name for deep models
    pipelineRequired : wrap the estimator in CountVectorizer + TfidfTransformer
    isDeepModel : use the Keras fit/predict path (argmax over class scores)
    """
    timer.start()
    # train the model
    clf = estimator
    if pipelineRequired :
        # Text -> token counts -> tf-idf -> estimator
        clf = Pipeline([('vect', CountVectorizer()),
                        ('tfidf', TfidfTransformer()),
                        ('clf', estimator),
                        ])
    if isDeepModel :
        # NOTE(review): call_backs() is assumed to be defined elsewhere in the
        # notebook — confirm before running this path in isolation.
        clf.fit(X_train, y_train, validation_data=(X_test, y_test),epochs=10, batch_size=128,verbose=1,callbacks=call_backs(arch_name))
        # predict from the classifier; argmax converts per-class scores to labels
        y_pred = clf.predict(X_test)
        y_pred = np.argmax(y_pred, axis=1)
        y_train_pred = clf.predict(X_train)
        y_train_pred = np.argmax(y_train_pred, axis=1)
    else :
        clf.fit(X_train, y_train)
        # predict from the classifier
        y_pred = clf.predict(X_test)
        y_train_pred = clf.predict(X_train)
    print('Estimator:', clf)
    print('='*80)
    print('Training accuracy: %.2f%%' % (accuracy_score(y_train,y_train_pred) * 100))
    print('Testing accuracy: %.2f%%' % (accuracy_score(y_test, y_pred) * 100))
    print('='*80)
    print('Confusion matrix:\n %s' % (confusion_matrix(y_test, y_pred)))
    print('='*80)
    print('Classification report:\n %s' % (classification_report(y_test, y_pred)))
    print(timer.stop(), 'to run the model')
# Create training and test datasets with 80:20 ratio
# (random_state fixed so the split is reproducible across runs)
X_train, X_test, y_train, y_test = train_test_split(df.Summary,
                                                    df.target,
                                                    test_size=0.20,
                                                    random_state=42)
# BUGFIX: the original labels said 'training set' / 'test set', but each line
# actually shows the (train, test) shapes of the features and of the labels.
print('\033[1mShape of the features (train, test):\033[0m', X_train.shape, X_test.shape)
print('\033[1mShape of the labels (train, test):\033[0m', y_train.shape, y_test.shape)
Shape of the training set: (6800,) (1700,) Shape of the test set: (6800,) (1700,)
Naive Bayes Classifier
Naive Bayes is a simple technique for constructing classifiers: models that assign class labels to problem instances, represented as vectors of feature values, where the class labels are drawn from some finite set. There is not a single algorithm for training such classifiers, but a family of algorithms based on a common principle: all naive Bayes classifiers assume that the value of a particular feature is independent of the value of any other feature, given the class variable.
Advantages:
Disadvantages:
# Baseline: Multinomial Naive Bayes on the tf-idf pipeline (default hyperparameters)
run_classification(MultinomialNB(), X_train, X_test, y_train, y_test)
Estimator: Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
('clf', MultinomialNB())])
================================================================================
Training accuracy: 56.93%
Testing accuracy: 53.24%
================================================================================
Confusion matrix:
[[761 0 0 ... 0 0 0]
[ 3 0 0 ... 0 2 0]
[ 15 0 0 ... 0 9 0]
...
[ 1 0 0 ... 0 0 0]
[ 15 0 0 ... 0 106 0]
[ 18 0 0 ... 0 38 0]]
================================================================================
Classification report:
precision recall f1-score support
0 0.53 1.00 0.69 761
1 0.00 0.00 0.00 8
2 0.00 0.00 0.00 24
3 0.00 0.00 0.00 5
4 0.64 0.21 0.32 42
5 0.00 0.00 0.00 26
6 0.00 0.00 0.00 20
7 0.00 0.00 0.00 8
8 0.00 0.00 0.00 20
9 0.00 0.00 0.00 17
10 0.00 0.00 0.00 18
11 0.00 0.00 0.00 58
12 0.00 0.00 0.00 51
13 0.00 0.00 0.00 5
14 0.00 0.00 0.00 7
15 0.00 0.00 0.00 5
16 0.00 0.00 0.00 3
17 1.00 0.40 0.57 72
18 0.00 0.00 0.00 20
19 0.00 0.00 0.00 13
20 0.00 0.00 0.00 2
21 0.00 0.00 0.00 8
22 0.00 0.00 0.00 20
23 0.00 0.00 0.00 42
24 0.00 0.00 0.00 6
25 0.00 0.00 0.00 24
27 0.00 0.00 0.00 14
28 0.00 0.00 0.00 12
29 0.00 0.00 0.00 1
30 0.00 0.00 0.00 2
31 0.00 0.00 0.00 2
32 0.00 0.00 0.00 2
33 0.00 0.00 0.00 6
34 0.00 0.00 0.00 26
35 0.00 0.00 0.00 8
36 0.00 0.00 0.00 10
37 0.00 0.00 0.00 8
39 0.00 0.00 0.00 1
40 0.00 0.00 0.00 12
41 0.00 0.00 0.00 2
42 0.00 0.00 0.00 9
43 0.00 0.00 0.00 6
45 0.00 0.00 0.00 28
46 0.00 0.00 0.00 2
47 0.00 0.00 0.00 1
48 0.00 0.00 0.00 1
49 0.00 0.00 0.00 3
51 0.00 0.00 0.00 1
56 0.00 0.00 0.00 46
57 0.00 0.00 0.00 5
59 0.00 0.00 0.00 5
60 0.00 0.00 0.00 1
62 0.00 0.00 0.00 2
63 0.00 0.00 0.00 1
64 0.00 0.00 0.00 1
66 0.00 0.00 0.00 1
67 0.00 0.00 0.00 18
70 0.00 0.00 0.00 1
72 0.48 0.88 0.62 121
73 0.00 0.00 0.00 56
accuracy 0.53 1700
macro avg 0.04 0.04 0.04 1700
weighted avg 0.33 0.53 0.39 1700
Time taken: 0.565176s to run the model
K-nearest Neighbor
# K-nearest neighbours on the tf-idf pipeline (default hyperparameters)
run_classification(KNeighborsClassifier(), X_train, X_test, y_train, y_test)
Estimator: Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
('clf', KNeighborsClassifier())])
================================================================================
Training accuracy: 70.13%
Testing accuracy: 62.88%
================================================================================
Confusion matrix:
[[742 0 0 ... 0 0 0]
[ 2 0 0 ... 0 2 0]
[ 10 0 9 ... 0 0 1]
...
[ 1 0 0 ... 0 0 0]
[ 5 0 2 ... 0 99 8]
[ 15 0 0 ... 0 28 10]]
================================================================================
Classification report:
precision recall f1-score support
0 0.63 0.98 0.77 761
1 0.00 0.00 0.00 8
2 0.75 0.38 0.50 24
3 1.00 0.40 0.57 5
4 0.50 0.36 0.42 42
5 0.44 0.27 0.33 26
6 0.50 0.25 0.33 20
7 0.00 0.00 0.00 8
8 0.50 0.05 0.09 20
9 1.00 0.59 0.74 17
10 0.50 0.28 0.36 18
11 0.53 0.16 0.24 58
12 0.67 0.39 0.49 51
13 0.00 0.00 0.00 5
14 0.00 0.00 0.00 7
15 0.00 0.00 0.00 5
16 0.00 0.00 0.00 3
17 0.97 0.81 0.88 72
18 0.50 0.15 0.23 20
19 1.00 0.08 0.14 13
20 0.00 0.00 0.00 2
21 0.00 0.00 0.00 8
22 0.80 0.20 0.32 20
23 0.60 0.14 0.23 42
24 0.29 0.67 0.40 6
25 0.44 0.17 0.24 24
27 0.57 0.29 0.38 14
28 1.00 0.08 0.15 12
29 0.00 0.00 0.00 1
30 1.00 1.00 1.00 2
31 0.00 0.00 0.00 2
32 0.00 0.00 0.00 2
33 0.33 0.17 0.22 6
34 0.00 0.00 0.00 26
35 0.50 0.12 0.20 8
36 1.00 0.20 0.33 10
37 1.00 0.38 0.55 8
39 0.00 0.00 0.00 1
40 0.50 0.08 0.14 12
41 0.00 0.00 0.00 2
42 0.00 0.00 0.00 9
43 1.00 0.33 0.50 6
45 1.00 0.36 0.53 28
46 0.00 0.00 0.00 2
47 0.00 0.00 0.00 1
48 0.00 0.00 0.00 1
49 0.00 0.00 0.00 3
51 0.00 0.00 0.00 1
54 0.00 0.00 0.00 0
56 0.74 0.54 0.62 46
57 0.00 0.00 0.00 5
59 0.33 0.20 0.25 5
60 0.00 0.00 0.00 1
62 0.00 0.00 0.00 2
63 0.00 0.00 0.00 1
64 0.00 0.00 0.00 1
66 0.00 0.00 0.00 1
67 0.33 0.11 0.17 18
70 0.00 0.00 0.00 1
72 0.58 0.82 0.68 121
73 0.48 0.18 0.26 56
accuracy 0.63 1700
macro avg 0.36 0.18 0.22 1700
weighted avg 0.60 0.63 0.56 1700
Time taken: 3.094667s to run the model
Support Vector Machine (SVM)
# SVM with Linear kernel (LinearSVC) on the tf-idf pipeline
run_classification(LinearSVC(), X_train, X_test, y_train, y_test)
Estimator: Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
('clf', LinearSVC())])
================================================================================
Training accuracy: 93.40%
Testing accuracy: 68.35%
================================================================================
Confusion matrix:
[[711 0 0 ... 0 0 1]
[ 0 0 0 ... 0 1 0]
[ 4 0 14 ... 0 1 0]
...
[ 1 0 0 ... 0 0 0]
[ 0 1 2 ... 0 107 1]
[ 7 0 0 ... 0 35 10]]
================================================================================
Classification report:
precision recall f1-score support
0 0.75 0.93 0.83 761
1 0.00 0.00 0.00 8
2 0.64 0.58 0.61 24
3 0.67 0.40 0.50 5
4 0.49 0.60 0.54 42
5 0.56 0.54 0.55 26
6 0.45 0.45 0.45 20
7 0.50 0.12 0.20 8
8 0.27 0.15 0.19 20
9 0.88 0.88 0.88 17
10 0.56 0.28 0.37 18
11 0.50 0.24 0.33 58
12 0.54 0.49 0.52 51
13 1.00 0.20 0.33 5
14 0.67 0.29 0.40 7
15 0.33 0.20 0.25 5
16 0.75 1.00 0.86 3
17 0.94 0.92 0.93 72
18 0.79 0.55 0.65 20
19 0.50 0.08 0.13 13
20 0.00 0.00 0.00 2
21 1.00 0.12 0.22 8
22 0.55 0.60 0.57 20
23 0.43 0.36 0.39 42
24 0.36 0.83 0.50 6
25 0.75 0.38 0.50 24
27 0.40 0.43 0.41 14
28 0.75 0.25 0.38 12
29 0.00 0.00 0.00 1
30 1.00 1.00 1.00 2
31 0.00 0.00 0.00 2
32 1.00 0.50 0.67 2
33 0.67 0.33 0.44 6
34 0.22 0.08 0.11 26
35 0.00 0.00 0.00 8
36 0.88 0.70 0.78 10
37 0.60 0.38 0.46 8
39 0.00 0.00 0.00 1
40 0.50 0.08 0.14 12
41 0.00 0.00 0.00 2
42 1.00 0.11 0.20 9
43 1.00 0.50 0.67 6
45 0.76 0.46 0.58 28
46 0.00 0.00 0.00 2
47 1.00 1.00 1.00 1
48 0.00 0.00 0.00 1
49 0.00 0.00 0.00 3
51 1.00 1.00 1.00 1
56 0.84 0.57 0.68 46
57 1.00 0.20 0.33 5
59 0.00 0.00 0.00 5
60 0.00 0.00 0.00 1
62 0.00 0.00 0.00 2
63 0.00 0.00 0.00 1
64 0.00 0.00 0.00 1
66 0.00 0.00 0.00 1
67 0.54 0.39 0.45 18
70 0.00 0.00 0.00 1
72 0.57 0.88 0.69 121
73 0.56 0.18 0.27 56
accuracy 0.68 1700
macro avg 0.49 0.34 0.37 1700
weighted avg 0.66 0.68 0.65 1700
Time taken: 1.310083s to run the model
# SVM with an RBF (Gaussian) kernel — note: 'rbf' is also SVC's default kernel
rbf_svm = SVC(kernel='rbf')
run_classification(rbf_svm, X_train, X_test, y_train, y_test)
Estimator: Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
('clf', SVC())])
================================================================================
Training accuracy: 82.46%
Testing accuracy: 62.29%
================================================================================
Confusion matrix:
[[753 0 0 ... 0 0 0]
[ 2 0 0 ... 0 1 0]
[ 14 0 9 ... 0 1 0]
...
[ 1 0 0 ... 0 0 0]
[ 4 0 2 ... 0 107 1]
[ 16 0 0 ... 0 35 3]]
================================================================================
Classification report:
precision recall f1-score support
0 0.60 0.99 0.75 761
1 0.00 0.00 0.00 8
2 0.82 0.38 0.51 24
3 0.00 0.00 0.00 5
4 0.50 0.43 0.46 42
5 0.55 0.23 0.32 26
6 1.00 0.25 0.40 20
7 0.00 0.00 0.00 8
8 0.00 0.00 0.00 20
9 1.00 0.88 0.94 17
10 0.67 0.11 0.19 18
11 1.00 0.03 0.07 58
12 0.71 0.39 0.51 51
13 0.00 0.00 0.00 5
14 1.00 0.14 0.25 7
15 0.00 0.00 0.00 5
16 0.00 0.00 0.00 3
17 0.95 0.82 0.88 72
18 0.86 0.30 0.44 20
19 0.00 0.00 0.00 13
20 0.00 0.00 0.00 2
21 0.00 0.00 0.00 8
22 0.33 0.05 0.09 20
23 0.67 0.05 0.09 42
24 0.50 0.17 0.25 6
25 0.80 0.17 0.28 24
27 0.60 0.21 0.32 14
28 0.00 0.00 0.00 12
29 0.00 0.00 0.00 1
30 1.00 0.50 0.67 2
31 0.00 0.00 0.00 2
32 0.00 0.00 0.00 2
33 1.00 0.17 0.29 6
34 0.00 0.00 0.00 26
35 0.00 0.00 0.00 8
36 0.00 0.00 0.00 10
37 0.00 0.00 0.00 8
39 0.00 0.00 0.00 1
40 0.00 0.00 0.00 12
41 0.00 0.00 0.00 2
42 0.00 0.00 0.00 9
43 1.00 0.17 0.29 6
45 0.76 0.46 0.58 28
46 0.00 0.00 0.00 2
47 0.00 0.00 0.00 1
48 0.00 0.00 0.00 1
49 0.00 0.00 0.00 3
51 0.00 0.00 0.00 1
56 0.85 0.50 0.63 46
57 0.00 0.00 0.00 5
59 0.00 0.00 0.00 5
60 0.00 0.00 0.00 1
62 0.00 0.00 0.00 2
63 0.00 0.00 0.00 1
64 0.00 0.00 0.00 1
66 0.00 0.00 0.00 1
67 0.50 0.17 0.25 18
70 0.00 0.00 0.00 1
72 0.58 0.88 0.70 121
73 0.60 0.05 0.10 56
accuracy 0.62 1700
macro avg 0.31 0.14 0.17 1700
weighted avg 0.59 0.62 0.53 1700
Time taken: 54.250050s to run the model
Decision Tree
run_classification(DecisionTreeClassifier(), X_train, X_test, y_train, y_test)
Estimator: Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
('clf', DecisionTreeClassifier())])
================================================================================
Training accuracy: 95.60%
Testing accuracy: 57.41%
================================================================================
Confusion matrix:
[[634 0 4 ... 0 2 5]
[ 0 1 0 ... 0 1 0]
[ 7 0 11 ... 0 0 2]
...
[ 1 0 0 ... 0 0 0]
[ 2 1 2 ... 0 87 7]
[ 11 0 0 ... 0 28 10]]
================================================================================
Classification report:
precision recall f1-score support
0 0.70 0.83 0.76 761
1 0.17 0.12 0.14 8
2 0.48 0.46 0.47 24
3 0.12 0.20 0.15 5
4 0.32 0.33 0.33 42
5 0.25 0.27 0.26 26
6 0.40 0.30 0.34 20
7 0.29 0.25 0.27 8
8 0.24 0.20 0.22 20
9 0.94 1.00 0.97 17
10 0.15 0.11 0.13 18
11 0.32 0.12 0.17 58
12 0.49 0.39 0.43 51
13 0.00 0.00 0.00 5
14 0.33 0.29 0.31 7
15 0.00 0.00 0.00 5
16 0.67 0.67 0.67 3
17 0.87 0.72 0.79 72
18 0.42 0.40 0.41 20
19 0.14 0.08 0.10 13
20 0.00 0.00 0.00 2
21 0.00 0.00 0.00 8
22 0.47 0.45 0.46 20
23 0.26 0.24 0.25 42
24 0.31 0.67 0.42 6
25 0.40 0.17 0.24 24
27 0.15 0.14 0.15 14
28 0.10 0.08 0.09 12
29 0.00 0.00 0.00 1
30 0.50 0.50 0.50 2
31 0.00 0.00 0.00 2
32 0.00 0.00 0.00 2
33 0.00 0.00 0.00 6
34 0.18 0.12 0.14 26
35 0.00 0.00 0.00 8
36 0.50 0.30 0.37 10
37 0.40 0.25 0.31 8
38 0.00 0.00 0.00 0
39 0.00 0.00 0.00 1
40 0.00 0.00 0.00 12
41 0.00 0.00 0.00 2
42 0.00 0.00 0.00 9
43 0.40 0.33 0.36 6
44 0.00 0.00 0.00 0
45 0.52 0.61 0.56 28
46 0.00 0.00 0.00 2
47 0.00 0.00 0.00 1
48 0.00 0.00 0.00 1
49 0.50 0.33 0.40 3
51 0.00 0.00 0.00 1
55 0.00 0.00 0.00 0
56 0.70 0.46 0.55 46
57 0.50 0.20 0.29 5
58 0.00 0.00 0.00 0
59 0.40 0.40 0.40 5
60 0.00 0.00 0.00 1
62 0.00 0.00 0.00 2
63 0.00 0.00 0.00 1
64 0.00 0.00 0.00 1
66 0.00 0.00 0.00 1
67 0.71 0.28 0.40 18
69 0.00 0.00 0.00 0
70 0.00 0.00 0.00 1
72 0.56 0.72 0.63 121
73 0.30 0.18 0.22 56
accuracy 0.57 1700
macro avg 0.23 0.20 0.21 1700
weighted avg 0.54 0.57 0.55 1700
Time taken: 2.602710s to run the model
Random Forest
run_classification(RandomForestClassifier(n_estimators=100), X_train, X_test, y_train, y_test)
Estimator: Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
('clf', RandomForestClassifier())])
================================================================================
Training accuracy: 95.60%
Testing accuracy: 62.71%
================================================================================
Confusion matrix:
[[755 0 0 ... 0 0 0]
[ 2 0 0 ... 0 1 0]
[ 15 0 8 ... 0 0 1]
...
[ 1 0 0 ... 0 0 0]
[ 2 1 2 ... 0 100 8]
[ 17 0 0 ... 0 28 10]]
================================================================================
Classification report:
precision recall f1-score support
0 0.60 0.99 0.75 761
1 0.00 0.00 0.00 8
2 0.80 0.33 0.47 24
3 0.00 0.00 0.00 5
4 0.59 0.38 0.46 42
5 0.40 0.08 0.13 26
6 1.00 0.15 0.26 20
7 0.00 0.00 0.00 8
8 0.00 0.00 0.00 20
9 1.00 0.88 0.94 17
10 1.00 0.11 0.20 18
11 1.00 0.02 0.03 58
12 0.80 0.39 0.53 51
13 0.00 0.00 0.00 5
14 1.00 0.14 0.25 7
15 0.00 0.00 0.00 5
16 0.00 0.00 0.00 3
17 0.88 0.88 0.88 72
18 0.83 0.25 0.38 20
19 0.00 0.00 0.00 13
20 0.00 0.00 0.00 2
21 0.00 0.00 0.00 8
22 1.00 0.05 0.10 20
23 1.00 0.17 0.29 42
24 0.31 0.67 0.42 6
25 1.00 0.21 0.34 24
27 0.50 0.14 0.22 14
28 0.50 0.08 0.14 12
29 0.00 0.00 0.00 1
30 1.00 1.00 1.00 2
31 0.00 0.00 0.00 2
32 0.00 0.00 0.00 2
33 0.00 0.00 0.00 6
34 0.67 0.08 0.14 26
35 0.00 0.00 0.00 8
36 1.00 0.20 0.33 10
37 0.00 0.00 0.00 8
39 0.00 0.00 0.00 1
40 0.00 0.00 0.00 12
41 0.00 0.00 0.00 2
42 0.00 0.00 0.00 9
43 1.00 0.33 0.50 6
45 0.68 0.46 0.55 28
46 0.00 0.00 0.00 2
47 0.00 0.00 0.00 1
48 0.00 0.00 0.00 1
49 0.00 0.00 0.00 3
51 0.00 0.00 0.00 1
56 0.85 0.50 0.63 46
57 1.00 0.20 0.33 5
59 0.00 0.00 0.00 5
60 0.00 0.00 0.00 1
62 0.00 0.00 0.00 2
63 0.00 0.00 0.00 1
64 0.00 0.00 0.00 1
66 0.00 0.00 0.00 1
67 0.00 0.00 0.00 18
70 0.00 0.00 0.00 1
72 0.60 0.83 0.69 121
73 0.45 0.18 0.26 56
accuracy 0.63 1700
macro avg 0.36 0.16 0.19 1700
weighted avg 0.62 0.63 0.54 1700
Time taken: 13.355746s to run the model
Observations:
The classical models all overfit noticeably (training accuracy far exceeds testing accuracy). We'll fine-tune the models and reduce the overfitting in the next iteration.